[FRAUD] Data cleanup attempt (following lesson6 with matrices - failed, heh)

Author

김보람

Published

August 22, 2023

imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import networkx as nx
import sklearn
import torch

# sklearn
from sklearn import model_selection # for train_test_split
from sklearn import ensemble # RF,GBM
from sklearn import metrics 

# embedding 
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder
def build_graph_bipartite(df_input, graph_type=nx.Graph()):
    df=df_input.copy()
    mapping={x:node_id for node_id, x in enumerate(set(df["cc_num"].values.tolist()+\
                                                      df["merchant"].values.tolist()))}
    
    df["from"]=df["cc_num"].apply(lambda x:mapping[x])  #엣지의 출발점
    df["to"]=df["merchant"].apply(lambda x:mapping[x])  #엣지의 도착점
    
    df = df[['from', 'to', "amt", "is_fraud"]].groupby(['from','to']).agg({"is_fraud":"sum","amt":"sum"}).reset_index()
    df["is_fraud"]=df["is_fraud"].apply(lambda x:1 if x>0 else 0)
    
    G=nx.from_edgelist(df[["from","to"]].values, create_using=graph_type)
    
    nx.set_edge_attributes(G,{(int(x["from"]),int(x["to"])):x["is_fraud"] for idx, x in df[["from","to","is_fraud"]].iterrows()}, "label")  # edge attribute: whether the edge involves fraud
    nx.set_edge_attributes(G,{(int(x["from"]),int(x["to"])):x["amt"] for idx,x in df[["from","to","amt"]].iterrows()}, "weight") # edge attribute: total transaction amount on the edge

    return G


def build_graph_tripartite(df_input, graph_type=nx.Graph()):
    df=df_input.copy()
    mapping={x:node_id for node_id, x in enumerate(set(df.index.values.tolist() + 
                                                       df["cc_num"].values.tolist() +
                                                       df["merchant"].values.tolist()))}
    df["in_node"]= df["cc_num"].apply(lambda x: mapping[x])
    df["out_node"]=df["merchant"].apply(lambda x:mapping[x])
    
        
    G=nx.from_edgelist([(x["in_node"], mapping[idx]) for idx, x in df.iterrows()] +\
                        [(x["out_node"], mapping[idx]) for idx, x in df.iterrows()], create_using=graph_type)
    
    nx.set_edge_attributes(G,{(x["in_node"], mapping[idx]):x["is_fraud"] for idx, x in df.iterrows()}, "label")     
    nx.set_edge_attributes(G,{(x["out_node"], mapping[idx]):x["is_fraud"] for idx, x in df.iterrows()}, "label")   
    nx.set_edge_attributes(G,{(x["in_node"], mapping[idx]):x["amt"] for idx, x in df.iterrows()}, "weight")  
    nx.set_edge_attributes(G,{(x["out_node"], mapping[idx]):x["amt"] for idx, x in df.iterrows()}, "weight")

    return G
    
    
def down_sample_textbook(df):
    df_majority = df[df.is_fraud==0].copy()
    df_minority = df[df.is_fraud==1].copy()
    df_maj_downsampled = sklearn.utils.resample(df_majority, n_samples=len(df_minority), replace=False, random_state=42)
    df_downsampled = pd.concat([df_minority, df_maj_downsampled])
    return df_downsampled

def embedding(Graph):
    # Graph -> X (feature)
    _edgs = list(Graph.edges)
    subGraph = Graph.edge_subgraph([_edgs[x] for x in range(len(Graph.edges))]).copy()
    subGraph.add_nodes_from(list(set(Graph.nodes) - set(subGraph.nodes)))    
    embedded = AverageEmbedder(Node2Vec(subGraph, weight_key='weight').fit(window=10).wv)
    X = [embedded[str(_edgs[x][0]), str(_edgs[x][1])] for x in range(len(Graph.edges))]
    # Graph -> y (label)
    y = np.array(list(nx.get_edge_attributes(Graph, "label").values()))
    return X,y 

def anal(df):
    Graph = build_graph_bipartite(df)
    X, y = embedding(Graph)  # embedding() returns edge features and labels
    X, XX, y, yy = sklearn.model_selection.train_test_split(X, y, random_state=42)
    lrnr = sklearn.ensemble.RandomForestClassifier(n_estimators=100, random_state=42) 
    lrnr.fit(X,y)
    yyhat = lrnr.predict(XX)
    df = pd.DataFrame({
        'acc':[sklearn.metrics.accuracy_score(yy,yyhat)], 
        'pre':[sklearn.metrics.precision_score(yy,yyhat)], 
        'rec':[sklearn.metrics.recall_score(yy,yyhat)],
        'f1':[sklearn.metrics.f1_score(yy,yyhat)]}
    )    
    return df

def our_sampling1(df):
    cus_list = set(df.query('is_fraud==1').cc_num.tolist())
    return df.query("cc_num in @cus_list")
fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:,1:]
fraudTrain = fraudTrain.assign(trans_date_trans_time= list(map(lambda x: pd.to_datetime(x), fraudTrain.trans_date_trans_time)))
fraudTrain
trans_date_trans_time cc_num merchant category amt first last gender street city ... lat long city_pop job dob trans_num unix_time merch_lat merch_long is_fraud
0 2019-01-01 00:00:00 2.703190e+15 fraud_Rippin, Kub and Mann misc_net 4.97 Jennifer Banks F 561 Perry Cove Moravian Falls ... 36.0788 -81.1781 3495 Psychologist, counselling 1988-03-09 0b242abb623afc578575680df30655b9 1325376018 36.011293 -82.048315 0
1 2019-01-01 00:00:00 6.304230e+11 fraud_Heller, Gutmann and Zieme grocery_pos 107.23 Stephanie Gill F 43039 Riley Greens Suite 393 Orient ... 48.8878 -118.2105 149 Special educational needs teacher 1978-06-21 1f76529f8574734946361c461b024d99 1325376044 49.159047 -118.186462 0
2 2019-01-01 00:00:00 3.885950e+13 fraud_Lind-Buckridge entertainment 220.11 Edward Sanchez M 594 White Dale Suite 530 Malad City ... 42.1808 -112.2620 4154 Nature conservation officer 1962-01-19 a1a22d70485983eac12b5b88dad1cf95 1325376051 43.150704 -112.154481 0
3 2019-01-01 00:01:00 3.534090e+15 fraud_Kutch, Hermiston and Farrell gas_transport 45.00 Jeremy White M 9443 Cynthia Court Apt. 038 Boulder ... 46.2306 -112.1138 1939 Patent attorney 1967-01-12 6b849c168bdad6f867558c3793159a81 1325376076 47.034331 -112.561071 0
4 2019-01-01 00:03:00 3.755340e+14 fraud_Keeling-Crist misc_pos 41.96 Tyler Garcia M 408 Bradley Rest Doe Hill ... 38.4207 -79.4629 99 Dance movement psychotherapist 1986-03-28 a41d7549acf90789359a9aa5346dcb46 1325376186 38.674999 -78.632459 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1048570 2020-03-10 16:07:00 6.011980e+15 fraud_Fadel Inc health_fitness 77.00 Haley Wagner F 05561 Farrell Crescent Annapolis ... 39.0305 -76.5515 92106 Accountant, chartered certified 1943-05-28 45ecd198c65e81e597db22e8d2ef7361 1362931649 38.779464 -76.317042 0
1048571 2020-03-10 16:07:00 4.839040e+15 fraud_Cremin, Hamill and Reichel misc_pos 116.94 Meredith Campbell F 043 Hanson Turnpike Hedrick ... 41.1826 -92.3097 1583 Geochemist 1999-06-28 c00ce51c6ebb7657474a77b9e0b51f34 1362931670 41.400318 -92.726724 0
1048572 2020-03-10 16:08:00 5.718440e+11 fraud_O'Connell, Botsford and Hand home 21.27 Susan Mills F 005 Cody Estates Louisville ... 38.2507 -85.7476 736284 Engineering geologist 1952-04-02 17c9dc8b2a6449ca2473726346e58e6c 1362931711 37.293339 -84.798122 0
1048573 2020-03-10 16:08:00 4.646850e+18 fraud_Thompson-Gleason health_fitness 9.52 Julia Bell F 576 House Crossroad West Sayville ... 40.7320 -73.1000 4056 Film/video editor 1990-06-25 5ca650881b48a6a38754f841c23b77ab 1362931718 39.773077 -72.213209 0
1048574 2020-03-10 16:08:00 2.283740e+15 fraud_Buckridge PLC misc_pos 6.81 Shannon Williams F 9345 Spencer Junctions Suite 183 Alpharetta ... 34.0770 -84.3033 165556 Prison officer 1997-12-27 8d0a575fe635bbde12f1a2bffc126731 1362931730 33.601468 -83.891921 0

1048575 rows × 22 columns

Attempt

_df1 = fraudTrain[fraudTrain["is_fraud"] == 0].sample(frac=0.20, random_state=42)
_df2 = fraudTrain[fraudTrain["is_fraud"] == 1]
df02 = pd.concat([_df1,_df2])
df02.shape
(214520, 22)
df50 = down_sample_textbook(df02)
df50.shape
(12012, 22)
12012*12012
144288144
df50 = df50.reset_index()
N = len(df50)

tr/test

df50_tr,df50_test = sklearn.model_selection.train_test_split(df50, random_state=42)
df50_tr.is_fraud.mean().round(5), df50_test.is_fraud.mean().round(5)
(0.49828, 0.50516)
df50_tr.shape, df50_test.shape
((9009, 23), (3003, 23))
train_mask = np.concatenate((np.full(9009, True), np.full(3003, False)))
test_mask = np.concatenate((np.full(9009, False), np.full(3003, True)))
print("Train Mask:", train_mask)
print("Test Mask:", test_mask)
Train Mask: [ True  True  True ... False False False]
Test Mask: [False False False ...  True  True  True]
train_mask.shape, test_mask.shape
((12012,), (12012,))
train_mask.sum(), test_mask.sum()
(9009, 3003)
df50_com = pd.concat([df50_tr, df50_test])
df50_com = df50_com.reset_index()

aj_matrix

# edge_index_list2_com = []
# for i in range(N):
#     for j in range(N):
#         if df50_com['cc_num'][i] != df50_com['cc_num'][j]:  
#             edge = 0
#         else:
#             edge = 1
#         edge_index_list2_com.append([i, j, edge])
#np.save('edge_index_list2_50_com.npy', edge_index_list2_com)
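The commented double loop above is O(N²) in pure Python; a minimal vectorized sketch (assuming df50_com is the frame built above) gives the same cc_num-sharing adjacency directly:

cc = df50_com['cc_num'].values
aj_matrix_fast = (cc[:, None] == cc[None, :]).astype(float)   # 1 where two rows share a cc_num (diagonal included), 0 otherwise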

edge_index_list2_com = np.load('edge_index_list2_50_com.npy')
edge_index_list2_com
array([[    0,     0,     1],
       [    0,     1,     0],
       [    0,     2,     0],
       ...,
       [12011, 12009,     0],
       [12011, 12010,     0],
       [12011, 12011,     1]])
edge_index_list2_com.shape
(144288144, 3)
num_nodes = 12012
aj_matrix = np.zeros((num_nodes, num_nodes))
aj_matrix
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])
for i, j ,edge in edge_index_list2_com:
    aj_matrix[i][j] = edge
aj_matrix
array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])
aj_matrix.shape
(12012, 12012)
# np.save('aj_matrix.npy', aj_matrix)

# aj_matrix = np.load('aj_matrix.npy')
# aj_matrix


weight matrix

# edge_index_list = []
# for i in range(N):
#     for j in range(N):
#         time_difference = (df50_com['trans_date_trans_time'][i] - df50_com['trans_date_trans_time'][j]).total_seconds()
#         edge_index_list.append([i, j, time_difference])
# np.save('edge_index_list_50_com.npy', edge_index_list)
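As with the adjacency, the pairwise time differences can be sketched without the double loop (assuming trans_date_trans_time is already datetime64, as set up above; the full 12012 x 12012 int64 matrix is memory-heavy):

t = df50_com['trans_date_trans_time'].values.astype('datetime64[s]').astype('int64')   # seconds since epoch
time_diff_matrix = t[:, None] - t[None, :]   # pairwise differences in seconds, shape (12012, 12012)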

edge_index_list = np.load('edge_index_list_50_com.npy')
edge_index_list[:5]
array([[ 0.000000e+00,  0.000000e+00,  0.000000e+00],
       [ 0.000000e+00,  1.000000e+00, -2.030190e+07],
       [ 0.000000e+00,  2.000000e+00, -2.841396e+07],
       [ 0.000000e+00,  3.000000e+00, -2.383788e+07],
       [ 0.000000e+00,  4.000000e+00, -2.687796e+07]])
edge_index = np.array(edge_index_list)
edge_index[:,2] = np.abs(edge_index[:,2])
theta = edge_index[:,2].mean()
theta
12238996.895508753
edge_index[:,2] = (np.exp(-edge_index[:,2]/theta)!=1) * np.exp(-edge_index[:,2]/theta)  # w = exp(-|Δt|/θ); the (... != 1) factor zeroes out pairs with Δt == 0
edge_index
array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 1.00000000e+00, 1.90369587e-01],
       [0.00000000e+00, 2.00000000e+00, 9.81172367e-02],
       ...,
       [1.20110000e+04, 1.20090000e+04, 9.25720620e-01],
       [1.20110000e+04, 1.20100000e+04, 5.15585903e-01],
       [1.20110000e+04, 1.20110000e+04, 0.00000000e+00]])

Trying to turn this into w_matrix, the [i][j] indexing doesn't work as-is - the node indices in edge_index come out as floats!

# # change the print format
# np.set_printoptions(formatter={'int': '{:d}'.format})
# # restore the original print format
# np.set_printoptions(formatter=None)
w_matrix = np.zeros((num_nodes, num_nodes))
w_matrix
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])
for i, j, weight in edge_index:   # the third column now holds exp(-|Δt|/θ), not raw seconds
    i, j = int(i), int(j)
    w_matrix[i][j] = weight
w_matrix
array([[0.        , 0.19036959, 0.09811724, ..., 0.29671829, 0.14162023,
        0.27467824],
       [0.19036959, 0.        , 0.51540395, ..., 0.6415836 , 0.74392254,
        0.69306396],
       [0.09811724, 0.51540395, 0.        , ..., 0.33067472, 0.69281937,
        0.3572079 ],
       ...,
       [0.29671829, 0.6415836 , 0.33067472, ..., 0.        , 0.4772885 ,
        0.92572062],
       [0.14162023, 0.74392254, 0.69281937, ..., 0.4772885 , 0.        ,
        0.5155859 ],
       [0.27467824, 0.69306396, 0.3572079 , ..., 0.92572062, 0.5155859 ,
        0.        ]])
w_matrix.shape
(12012, 12012)
np.save('w_matrix.npy', w_matrix)
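The loop works once i and j are cast to int; a vectorized sketch of the same fill (assuming edge_index keeps its [i, j, weight] column layout):

ii = edge_index[:, 0].astype(int)
jj = edge_index[:, 1].astype(int)
w_matrix_fast = np.zeros((num_nodes, num_nodes))
w_matrix_fast[ii, jj] = edge_index[:, 2]   # fancy indexing fills every entry in one shot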

# np.save('edge_index_list_plus.npy', edge_index_list_plus)

edge_index_list_plus = np.load('edge_index_list_plus.npy')
edge_index = np.array(edge_index_list_plus)
edge_index.shape
(144288144, 3)
edge_index
array([[0.0000e+00, 0.0000e+00, 0.0000e+00],
       [0.0000e+00, 1.0000e+00, 0.0000e+00],
       [0.0000e+00, 2.0000e+00, 0.0000e+00],
       ...,
       [1.2011e+04, 1.2009e+04, 0.0000e+00],
       [1.2011e+04, 1.2010e+04, 0.0000e+00],
       [1.2011e+04, 1.2011e+04, 0.0000e+00]])
edge_index[:,2] = np.abs(edge_index[:,2])
theta = edge_index[:,2].mean()
theta
10973.519989002007
edge_index[:,2] = (np.exp(-edge_index[:,2]/theta)!=1) * np.exp(-edge_index[:,2]/theta)
edge_index
array([[0.0000e+00, 0.0000e+00, 0.0000e+00],
       [0.0000e+00, 1.0000e+00, 0.0000e+00],
       [0.0000e+00, 2.0000e+00, 0.0000e+00],
       ...,
       [1.2011e+04, 1.2009e+04, 0.0000e+00],
       [1.2011e+04, 1.2010e+04, 0.0000e+00],
       [1.2011e+04, 1.2011e+04, 0.0000e+00]])
edge_index_list_updated = edge_index.tolist()
mm = np.array(edge_index_list_updated)[:,2].mean()
selected_edges = [(int(row[0]), int(row[1])) for row in edge_index_list_updated if row[2] > mm]
edge_index_selected = torch.tensor(selected_edges, dtype=torch.long).t()
edge_index_selected.shape
torch.Size([2, 51392])
np.save('edge_index_selected.npy', edge_index_selected)

Following pyg lesson6

Setting up data (x, edge_index, y)

x = torch.tensor(df50_com['amt'].values, dtype=torch.float).reshape(-1,1)
x
tensor([[921.2400],
        [698.2800],
        [220.5600],
        ...,
        [ 17.9700],
        [  7.5800],
        [824.9900]])
y = torch.tensor(df50_com['is_fraud'].values, dtype=torch.int64)
y
tensor([1, 1, 0,  ..., 1, 0, 1])
import torch_geometric
data = torch_geometric.data.Data(x=x, edge_index = edge_index_selected, y=y)

#train_mask = train_mask, test_mask = test_mask
data
Data(x=[12012, 1], edge_index=[2, 51392], y=[12012])
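The masks commented out above can also be attached; a minimal sketch (assuming the node order in data follows df50_com, i.e. the 9009 training rows come first):

data.train_mask = torch.tensor(train_mask)   # boolean mask over the 12012 nodes
data.test_mask = torch.tensor(test_mask)
# data now also carries train_mask and test_mask alongside x, edge_index, y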

GCNConv

gconv = torch_geometric.nn.GCNConv(1,4)
gconv
GCNConv(1, 4)
gconv(data.x, data.edge_index)
tensor([[-5.1237e+02,  5.3152e+02, -2.9626e+01,  5.3703e+02],
        [-4.2507e+02,  4.4096e+02, -2.4578e+01,  4.4553e+02],
        [-1.9991e+02,  2.0738e+02, -1.1559e+01,  2.0953e+02],
        ...,
        [-3.8459e+02,  3.9897e+02, -2.2238e+01,  4.0310e+02],
        [-6.8703e+00,  7.1271e+00, -3.9725e-01,  7.2010e+00],
        [-5.2357e+02,  5.4314e+02, -3.0273e+01,  5.4877e+02]],
       grad_fn=<AddBackward0>)
list(gconv.parameters())
[Parameter containing:
 tensor([0., 0., 0., 0.], requires_grad=True),
 Parameter containing:
 tensor([[-0.9064],
         [ 0.9403],
         [-0.0524],
         [ 0.9500]], requires_grad=True)]
_,W = list(gconv.parameters())
W
Parameter containing:
tensor([[-0.9064],
        [ 0.9403],
        [-0.0524],
        [ 0.9500]], requires_grad=True)
A = torch.tensor(aj_matrix, dtype=torch.float32)
Atilde = A + torch.eye(12012)
Atilde
tensor([[2., 0., 0.,  ..., 0., 0., 0.],
        [0., 2., 0.,  ..., 0., 0., 0.],
        [0., 0., 2.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 2., 0., 0.],
        [0., 0., 0.,  ..., 0., 2., 0.],
        [0., 0., 0.,  ..., 0., 0., 2.]])

Huh? Wasn't it around here that the self-edges were supposed to be set to 0..?!

Atilde@data.x@W.T/3, gconv(data.x,data.edge_index)
(tensor([[-2327.1099,  2414.0901,  -134.5556,  2439.1140],
         [-2175.9143,  2257.2434,  -125.8134,  2280.6416],
         [ -344.8129,   357.7009,   -19.9374,   361.4088],
         ...,
         [ -647.5668,   671.7708,   -37.4429,   678.7343],
         [-2294.9517,  2380.7297,  -132.6962,  2405.4080],
         [-3068.6165,  3183.3123,  -177.4302,  3216.3098]],
        grad_fn=<DivBackward0>),
 tensor([[-5.1237e+02,  5.3152e+02, -2.9626e+01,  5.3703e+02],
         [-4.2507e+02,  4.4096e+02, -2.4578e+01,  4.4553e+02],
         [-1.9991e+02,  2.0738e+02, -1.1559e+01,  2.0953e+02],
         ...,
         [-3.8459e+02,  3.9897e+02, -2.2238e+01,  4.0310e+02],
         [-6.8703e+00,  7.1271e+00, -3.9725e-01,  7.2010e+00],
         [-5.2357e+02,  5.4314e+02, -3.0273e+01,  5.4877e+02]],
        grad_fn=<AddBackward0>))
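Part of the mismatch: gconv was run on data.edge_index (the time-weight-selected edges), not on aj_matrix, and GCNConv normalizes by degree on both sides instead of dividing by 3. A dense sketch of what GCNConv actually computes, D^(-1/2) (A + I) D^(-1/2) X W^T (assuming the selected edge list is symmetric; the bias is zero at initialization):

A_sel = torch.zeros(12012, 12012)
A_sel[data.edge_index[0], data.edge_index[1]] = 1.0   # adjacency of the edges actually passed to gconv
Atilde_sel = A_sel + torch.eye(12012)                 # GCNConv adds the self-loops internally
d_inv_sqrt = Atilde_sel.sum(dim=1).pow(-0.5)
out_manual = (d_inv_sqrt[:, None] * Atilde_sel * d_inv_sqrt[None, :]) @ data.x @ W.T
# out_manual should come out close to gconv(data.x, data.edge_index)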

Thinking about how I chose A....... I did it wrong! I keep using just the raw edges, but GCNConv picked its edges through the weights, so here too it seems I should be building it with that mm threshold... but how do I actually calculate that?
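One way to bring the weights back in (a sketch, not something from lesson6): GCNConv's forward also accepts an optional edge_weight, so the weight of each selected edge can be kept at selection time and passed along instead of being dropped:

selected = [row for row in edge_index_list_updated if row[2] > mm]
edge_index_sel  = torch.tensor([(int(r[0]), int(r[1])) for r in selected], dtype=torch.long).t()
edge_weight_sel = torch.tensor([r[2] for r in selected], dtype=torch.float)
out_weighted = gconv(data.x, edge_index_sel, edge_weight_sel)   # the weights enter GCNConv's degree normalization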